library(NbClust)
library(factoextra)
library(ClusterR)
library(ggplot2)
library(scatterplot3d)
library(dplyr)
library(stringr)
library(VIM)
library(fpc)
library(plotly)
# Load the eight exercise datasets (Data1.csv .. Data8.csv) and the
# World Indicators dataset from the working directory.
# NOTE(review): only data1 is used in the analysis visible in this chunk;
# the others are presumably used later in the document.
data1 <- read.csv('Data1.csv')
data2 <- read.csv('Data2.csv')
data3 <- read.csv('Data3.csv')
data4 <- read.csv('Data4.csv')
data5 <- read.csv('Data5.csv')
data6 <- read.csv('Data6.csv')
data7 <- read.csv('Data7.csv')
data8 <- read.csv('Data8.csv')
world_data <- read.csv('World Indicators.csv')
We check the K-value recommendations from the elbow method and the silhouette method
# Fix the RNG seed so the kmeans runs inside fviz_nbclust are reproducible
set.seed(10)
# Elbow plot: total within-cluster sum of squares vs. number of clusters
# (columns 2:4 are the feature columns X1..X3; column 1 holds the Class label)
fviz_nbclust(data1[,2:4], kmeans, method = 'wss')
# Average silhouette width vs. number of clusters
fviz_nbclust(data1[,2:4], kmeans, method = 'silhouette')
K suggested by both the elbow and silhouette methods = 6
#Setting k = 6 from silhouette recommendation
# Candidate k from the silhouette recommendation above
k <- 6
# Scan the Calinski-Harabasz index for cluster counts in a window of +/- 2
# around the candidate k to see whether a neighbouring value does better
print("K CH Value")
## [1] "K CH Value"
for (n_clusters in seq(k - 2, k + 2)) {
  fit <- kmeans(data1[, 2:4], centers = n_clusters, nstart = 20)
  ch_score <- round(calinhara(data1[, 2:4], fit$cluster), digits = 2)
  print(paste(n_clusters, ch_score))
}
## [1] "4 98.68"
## [1] "5 146.82"
## [1] "6 262.71"
## [1] "7 225.1"
## [1] "8 478.81"
Although the CH value for k = 6 (262.71) is greater than for k = 7 (225.1), we select k = 7, which (per the validation output below) matches the true class labels perfectly
# Fit the final k-means model with k = 7 clusters on the feature columns.
# The recorded validation output below (jaccard = 1, purity = 1) shows this
# partition coincides exactly with the ground-truth Class labels.
km <- kmeans(data1[, 2:4], 7, nstart = 20)
data1$kmeans_cluster <- km$cluster
# Display external validation stats against the true labels.
# Use TRUE rather than T: T is an ordinary (reassignable) variable,
# TRUE is a reserved word.
external_validation(true_labels = data1$Class, clusters = data1$kmeans_cluster,
                    method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1, purity = 1 for our clustering solution
# Compute the pairwise distance object once and reuse it, instead of calling
# dist() twice (once for the matrix form, once inside hclust) — dist() is
# O(n^2) in both time and memory.
dist_obj <- dist(data1[, 2:4])
# Matrix form kept under its original name; it may be used later in the file
dist_matrix <- as.matrix(dist_obj)
# Single-linkage hierarchical clustering on the same distances
hc.single <- hclust(dist_obj, method = 'single')
# Dendrogram used to choose the number of clusters below
plot(hc.single)
From the dendrogram, we set the number of clusters to 7
# Cut the single-linkage dendrogram into 7 clusters (chosen from the plot above)
data1$h_cluster <- cutree(hc.single, k = 7)
# Validate the hierarchical partition against the true labels.
# TRUE rather than T: T is reassignable, TRUE is reserved.
external_validation(true_labels = data1$Class, clusters = data1$h_cluster,
                    method = 'jaccard_index', summary_stats = TRUE)
##
## ----------------------------------------
## purity : 1
## entropy : 0
## normalized mutual information : 1
## variation of information : 0
## normalized var. of information : 0
## ----------------------------------------
## specificity : 1
## sensitivity : 1
## precision : 1
## recall : 1
## F-measure : 1
## ----------------------------------------
## accuracy OR rand-index : 1
## adjusted-rand-index : 1
## jaccard-index : 1
## fowlkes-mallows-index : 1
## mirkin-metric : 0
## ----------------------------------------
## [1] 1
Jaccard index = 1 and purity = 1 for our hierarchical clustering

## Plotting data according to actual class, K-means clustering and hierarchical clustering
# 3D scatter plot of the three features coloured by the ground-truth class
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'X1'),
                      yaxis = list(title = 'X2'),
                      zaxis = list(title = 'X3')))
fig
# 3D scatter plot coloured by the k-means cluster assignments
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'X1'),
                      yaxis = list(title = 'X2'),
                      zaxis = list(title = 'X3')))
fig
# 3D scatter plot coloured by the hierarchical (single-linkage) clusters
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'X1'),
                      yaxis = list(title = 'X2'),
                      zaxis = list(title = 'X3')))
fig